In ‘Easy Geom Recipes’ the computation we did was within groups.

Grouping variables in ggplot2 are character, factor or logical variables.

If we use the compute_group parameter in our ggproto function to define our stat, computation will happen in a group-wise basis. In other words, our target layer geom_* will divide up our data set by any character, factor, or logical variables, and then do computation before returning a data frame for plotting.

This is not always the desired behavior. Sometimes computation has to happen across groups. The examples that follow are such cases.

First we’ll see a regression model that has an indicator variable as one of its inputs. We want the model to be computed holistically; geom_smooth(), which you may have used, computes groupwise; i.e. one model for each group. We’ll use panel-level computation to display across group model results.

The second example is circle packing. We can use the {circlepack} library to visualize quantities for entities. To wrap this work up into a geom function, we need the computation that positions the circles to happen at the panel level – it shouldn’t do the computation within groups.

Finally, we look at sf data. Here, I think it may be possible to use set compute_group or compute_panel and get similar visual results. However, computing and plotting a panel (we’ll do this with an inner join of an input data set and an sf reference data frame) is much faster than computing and group-wise. (just some guesses, need to try with sf w/ fewer geometries than fips.)

class: inverse, middle, center

Step 0: use base ggplot2 to get the job done


library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)

penguins <- remove_missing(penguins)
## Warning: Removed 11 rows containing missing values.
model <- lm(body_mass_g ~ flipper_length_mm + species, data = penguins)

penguins %>% 
  mutate(fitted = model$fitted.values) ->
penguins_w_fitted

penguins_w_fitted %>% 
  ggplot() + 
  aes(x = flipper_length_mm) + 
  aes(y = body_mass_g) + 
  aes(color = species) +
  geom_point() + 
  geom_line(aes(y = fitted, group = species))

Step 1: computation

compute_panel_ols_ind <- function(data, scales, formula = y ~ x + indicator) {

  model <- lm(formula = formula,
              data = data)

  data.frame(x = data$x,
             y = model$fitted.values,
             indicator = data$indicator,
             xend = data$x,
             yend = data$y)

}

Step 2: define ggproto

–

StatLmindicator <- ggplot2::ggproto("StatLmindicator",
                                      ggplot2::Stat,
                                      compute_panel = compute_panel_ols_ind,
                                      required_aes = c("x", "y", "indicator"),
                                      default_aes = ggplot2::aes(
                                        group = ggplot2::after_stat(indicator),
                                        color = indicator))

Step 3: define geom_* function

–

geom_lm_indicator <- function(mapping = NULL, data = NULL,
                                position = "identity", na.rm = FALSE,
                                show.legend = NA,
                                inherit.aes = TRUE, ...) {
  ggplot2::layer(
    stat = StatLmindicator, # proto object from Step 2
    geom = ggplot2::GeomLine, # inherit other behavior
    data = data,
    mapping = mapping,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}

Step 4: Enjoy! Use your function


ggplot(palmerpenguins::penguins) +
  aes(x = flipper_length_mm ) +
  aes(y = body_mass_g ) +
  geom_point() + 
  aes(color = species) +
  aes(indicator = species) +
  geom_lm_indicator()
## Warning: Removed 2 rows containing non-finite values (`stat_lmindicator()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

ggplot(palmerpenguins::penguins) +
  aes(x = flipper_length_mm ) +
  aes(y = body_mass_g ) +
  geom_point() + 
  aes(color = species) +
  aes(indicator = species) +
  geom_lm_indicator(formula = y ~ x * indicator)
## Warning: Removed 2 rows containing non-finite values (`stat_lmindicator()`).
## Removed 2 rows containing missing values (`geom_point()`).

ggplot(palmerpenguins::penguins) +
  aes(x = flipper_length_mm ) +
  aes(y = body_mass_g ) +
  geom_point() + 
  aes(color = species) +
  aes(indicator = species) +
  geom_lm_indicator(formula = y ~ I(x^3) + I(x^2) + x + indicator)
## Warning: Removed 2 rows containing non-finite values (`stat_lmindicator()`).
## Removed 2 rows containing missing values (`geom_point()`).

Explain: What will the in-facet behavior be?

ggplot(palmerpenguins::penguins) +
  aes(x = flipper_length_mm ) +
  aes(y = body_mass_g ) +
  geom_point() + 
  aes(color = species) +
  aes(indicator = species) +
  geom_lm_indicator(formula = y ~ x + indicator) + 
  facet_wrap(~sex)
## Warning: Removed 2 rows containing non-finite values (`stat_lmindicator()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).

Step 0: Use base ggplot2 to get the job done

library(tidyverse)
library(gapminder)
gapminder %>%  
  filter(continent == "Americas") %>%  
  filter(year == 2002) %>%  
  select(country, pop) ->  
prep  

packcircles::circleProgressiveLayout(prep$pop,  
                                         sizetype = 'area') ->  
pack  

cbind(prep, pack) %>%
  mutate(id = row_number())
##                country       pop           x          y    radius id
## 1            Argentina  38331121  -3493.0180     0.0000 3493.0180  1
## 2              Bolivia   8445134   1639.5639     0.0000 1639.5639  2
## 3               Brazil 179914212   2732.7743 -9142.0260 7567.5936  3
## 4               Canada  31902268   1150.7522  4801.4069 3186.6608  4
## 5                Chile  15497046   5273.8171  1302.3806 2221.0049  5
## 6             Colombia  41008227  10562.3302 -1160.6508 3612.9384  6
## 7           Costa Rica   3834934  -4573.1172 -4469.2048 1104.8518  7
## 8                 Cuba  11226999  -7453.2262 -3646.6547 1890.4139  8
## 9   Dominican Republic   8650322  -8636.6596  -299.9554 1659.3622  9
## 10             Ecuador  12921234  -7908.2513  3314.7888 2028.0425 10
## 11         El Salvador   6353681  -4769.2005  4746.5765 1422.1250 11
## 12           Guatemala  11178650  -3084.4567  7593.9564 1886.3390 12
## 13               Haiti   7607651   -133.6785  9366.9797 1556.1460 13
## 14            Honduras   6677328   2869.9023  9116.0827 1457.8956 14
## 15             Jamaica   2664659   4409.2334  7302.3944  920.9708 15
## 16              Mexico 102479927  10986.7220  8154.0495 5711.4249 16
## 17           Nicaragua   5146848  -5728.2786 -6555.5701 1279.9580 17
## 18              Panama   2990875  -7982.0775 -6463.5726  975.7177 18
## 19            Paraguay   5884491  -6893.5704  6556.3419 1368.6094 19
## 20                Peru  26769436  -6631.4420 10836.0023 2919.0711 20
## 21         Puerto Rico   3859606  -2614.0309 10551.5165 1108.4001 21
## 22 Trinidad and Tobago   1101832  -1083.9386 11293.7595  592.2196 22
## 23       United States 287675526 -18526.1513 -6598.5236 9569.2196 23
## 24             Uruguay   3363085 -11041.9619   913.3993 1034.6512 24
## 25           Venezuela  24287670   4129.4010 13162.9819 2780.4686 25
pack %>%  
  packcircles::circleLayoutVertices(npoints = 50) ->  
circle_outlines  

circle_outlines %>%  
  ggplot() +  
  aes(x = x, y = y) +  
  geom_polygon(colour = "black", alpha = 0.6) +  
  aes(group = id) +  
  aes(fill = factor(id)) +  
  geom_text(data = cbind(prep, pack),  
            aes(x, y, size = pop, label = country,  
                group = NULL, fill = NULL)) +  
  theme(legend.position = "none") +  
  coord_equal()


Step 1: computation

# you won't use the scales argument, but ggplot will later
compute_panel_circle_pack <- function(data, scales){
 
  data %>%
    mutate(id = row_number()) ->
  data1
 
  if(is.null(data$area)){
    
    data1 %>% 
      mutate(area = 1) ->
    data1
    
  }
  
  data1 %>%  
    pull(area) %>%
    packcircles::circleProgressiveLayout(
                                         sizetype = 'area') %>%
    packcircles::circleLayoutVertices(npoints = 300) %>%
    left_join(data1) #%>%
    # rename(group = id)
   
}


# step 1b test the computation function
gapminder::gapminder %>%
  filter(continent == "Americas") %>%  
  filter(year == 2002) %>%  
  # input must have required aesthetic inputs as columns
  rename(area = pop) %>%
  compute_panel_circle_pack() %>%
  head()
## Joining with `by = join_by(id)`
##             x         y id   country continent year lifeExp     area gdpPercap
## 1   0.0000000   0.00000  1 Argentina  Americas 2002   74.34 38331121  8797.641
## 2  -0.7660766  73.15225  1 Argentina  Americas 2002   74.34 38331121  8797.641
## 3  -3.0639703 146.27241  1 Argentina  Americas 2002   74.34 38331121  8797.641
## 4  -6.8926731 219.32842  1 Argentina  Americas 2002   74.34 38331121  8797.641
## 5 -12.2505058 292.28821  1 Argentina  Americas 2002   74.34 38331121  8797.641
## 6 -19.1351181 365.11980  1 Argentina  Americas 2002   74.34 38331121  8797.641
gapminder::gapminder %>%
  filter(continent == "Americas") %>%  
  filter(year == 2002) %>%  
  # input must have required aesthetic inputs as columns
  rename(area = pop) %>%
  compute_panel_circle_pack() %>% 
  str()
## Joining with `by = join_by(id)`
## 'data.frame':    7525 obs. of  9 variables:
##  $ x        : num  0 -0.766 -3.064 -6.893 -12.251 ...
##  $ y        : num  0 73.2 146.3 219.3 292.3 ...
##  $ id       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ year     : int  2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
##  $ lifeExp  : num  74.3 74.3 74.3 74.3 74.3 ...
##  $ area     : int  38331121 38331121 38331121 38331121 38331121 38331121 38331121 38331121 38331121 38331121 ...
##  $ gdpPercap: num  8798 8798 8798 8798 8798 ...
# step 1b test the computation function
gapminder::gapminder %>%
  filter(continent == "Americas") %>%  
  filter(year == 2002) %>%  
  # input must have required aesthetic inputs as columns
  rename(area = pop) %>%
  compute_panel_circle_pack() %>% 
  ggplot() + 
  aes(x = x, y = y, fill = country) + 
  geom_polygon()
## Joining with `by = join_by(id)`

my_setup_data <- function(data, params){
                                    if(data$group[1] == -1){
                                      nrows <- nrow(data)
                                      data$group <- seq_len(nrows)
                                    }
                                    data
                                  }

Step 2: define ggproto

StatCirclepack <- ggplot2::ggproto(`_class` = "StatCirclepack",
                                  `_inherit` = ggplot2::Stat,
                                  required_aes = c("id"),
                                  compute_panel = compute_panel_circle_pack#,
                                  # setup_data = my_setup_data,
                                  # default_aes = aes(fill = after_stat(area))
                                  )

Step 3: define geom_* function

geom_polygon_circlepack <- function(mapping = NULL, data = NULL,
                           position = "identity", na.rm = FALSE,
                           show.legend = NA,
                           inherit.aes = TRUE, ...) {
  ggplot2::layer(
    stat = StatCirclepack, # proto object from Step 2
    geom = ggplot2::GeomPoint, # inherit other behavior
    data = data,
    mapping = mapping,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}

Step 4: Enjoy! Use your function

gapminder::gapminder %>%
  filter(year == 2002) %>%
  ggplot() +
  aes(id = country) + 
  geom_polygon_circlepack(alpha = .5, size = .002)
## Joining with `by = join_by(id)`

last_plot() + 
  aes(color = continent)
## Joining with `by = join_by(id)`

last_plot() + 
  aes(area = pop)
## Joining with `by = join_by(id)`

last_plot() +
  aes(color = continent) +
  facet_wrap(facets = vars(continent)) 
## Joining with `by = join_by(id)`
## Joining with `by = join_by(id)`
## Joining with `by = join_by(id)`
## Joining with `by = join_by(id)`
## Joining with `by = join_by(id)`


Step 0: get it done with base ggplot2

Example simple feature to data frame…

library(tidyverse)
library(sf)
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
fips_geometries <- readRDS(url("https://wilkelab.org/SDS375/datasets/US_counties.rds")) %>%
  rename(FIPS = GEOID)

US_census <- read_csv("https://wilkelab.org/SDS375/datasets/US_census.csv",
                      col_types = cols(FIPS = "c")
          )

# from Claus Wilke on ggplot2

fips_geometries %>%
  left_join(US_census, by = "FIPS") %>%
  ggplot() +
  geom_sf(aes(fill = mean_work_travel), linewidth = .1) + 
  scale_fill_viridis_c(option = "magma") ->
classic_plot_sf_layer

classic_plot_sf_layer

Step 1: computation

layer_data(classic_plot_sf_layer) %>% 
  select(geometry, xmin, xmax, ymin, ymax) %>% 
  bind_cols(tibble(FIPS = fips_geometries$FIPS)) %>% 
  rename(fips = FIPS) ->
reference

compute_panel_county <- function(data, scales){
 
  data %>% 
    # inner_join(fips_ggplot2_reference) 
    inner_join(reference, multiple = "all") %>% 
    mutate(group = -1)

}

Step 2: pass to ggproto

StatCounty <- ggplot2::ggproto(`_class` = "StatCounty",
                                  `_inherit` = ggplot2::Stat,
                                  # required_aes = c("fips"),
                                  # setup_data = my_setup_data,
                                  compute_panel = compute_panel_county,
                                  default_aes = aes(geometry = after_stat(geometry))
                                  )

Step 3: write geom_* function

geom_sf_county <- function(
  mapping = NULL,
  data = NULL,
  position = "identity",
  na.rm = FALSE,
  show.legend = NA,
  inherit.aes = TRUE, ...) {
  ggplot2::layer(
    stat = StatCounty,  # proto object from step 2
    geom = ggplot2::GeomSf,  # inherit other behavior
    data = data,
    mapping = mapping,
    position = position,
    show.legend = show.legend,
    inherit.aes = inherit.aes,
    params = list(na.rm = na.rm, ...)
  )
}

step 4: test geom_* function, celebrate!

read_csv("https://wilkelab.org/SDS375/datasets/US_census.csv",
                      col_types = cols(FIPS = "c")) %>%  
  ggplot() + 
  aes(fips = FIPS) +
  geom_sf_county(linewidth = .02, 
                      color = "darkgrey") +
  aes(fill = mean_work_travel) + 
  coord_sf() +
  scale_fill_viridis_c(option = "magma")
## Joining with `by = join_by(fips)`